from IPython.display import HTML
HTML("""
<!-- ---------- Aesthetic “Hide / Show code” button ------------ -->
<style>
/* Fancy toggle button */
#toggle-code-btn{
position:fixed;
top:15px;
right:15px;
z-index:9999;
padding:10px 22px;
background:linear-gradient(135deg,#ff6637,#ff3366);
color:#ffffff;
font-family:"Segoe UI",sans-serif;
font-size:14px;
letter-spacing:.4px;
border:none;
border-radius:30px;
cursor:pointer;
box-shadow:0 4px 8px rgba(0,0,0,.15);
transition:all .2s ease;
}
#toggle-code-btn:hover{
box-shadow:0 6px 14px rgba(0,0,0,.25);
transform:translateY(-2px);
}
#toggle-code-btn:active{
box-shadow:0 3px 6px rgba(0,0,0,.18);
transform:translateY(0);
}
</style>
<script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
<script>
function toggleCode(){
/* Classic Notebook */
$('div.input').toggle();
/* JupyterLab + nbconvert HTML */
$('.jp-InputArea, .jp-CodeCell .jp-Cell-inputWrapper').toggle();
}
$(document).ready(function(){
/* Inject button only once */
if(!$('#toggle-code-btn').length){
$('body').append('<button id="toggle-code-btn">Hide code</button>');
$('#toggle-code-btn').on('click',function(){
toggleCode();
/* Swap label */
$(this).text(
$(this).text().startsWith('Hide') ? 'Show code' : 'Hide code'
);
});
}
});
</script>
""")
import os
import glob
import gc
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
r2_score,
mean_squared_error,
median_absolute_error,
)
from scipy.stats import skew, kurtosis
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller, acf
from arch import arch_model
# # Save the combined DataFrame to CSV
# combined_path = "C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/order_book_combined.csv"
# df_combined.to_csv(combined_path, index=False)
# print(f"Combined data saved to {combined_path}")
# Load the previously saved combined DataFrame
df_combined = pd.read_csv("C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/order_book_combined.csv")
df_combined.head()
| | stock_id | time_id | seconds_in_bucket | bid_price1 | ask_price1 | bid_price2 | ask_price2 | bid_size1 | ask_size1 | bid_size2 | ask_size2 | wap | BidAskSpread | spread | log_return | realized_volatility | time_bucket | imbalance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8382 | 6 | 1800.0 | 740.03 | 740.29 | 740.0 | 740.30 | 6 | 6 | 800 | 40 | 740.160000 | 0.000351 | 0.26 | 0.000000 | 0.011885 | 60 | 0.000000 |
| 1 | 8382 | 6 | 1800.0 | 740.03 | 740.29 | 740.0 | 740.30 | 6 | 6 | 800 | 40 | 740.160000 | 0.000351 | 0.26 | 0.000000 | 0.011885 | 60 | 0.000000 |
| 2 | 8382 | 6 | 1801.0 | 740.05 | 740.29 | 740.0 | 740.30 | 25 | 1 | 99 | 40 | 740.280769 | 0.000324 | 0.24 | 0.000163 | 0.011885 | 61 | 0.923077 |
| 3 | 8382 | 6 | 1801.0 | 740.05 | 740.29 | 740.0 | 740.30 | 25 | 1 | 99 | 40 | 740.280769 | 0.000324 | 0.24 | 0.000000 | 0.011885 | 61 | 0.923077 |
| 4 | 8382 | 6 | 1802.0 | 740.06 | 740.36 | 740.0 | 740.39 | 100 | 30 | 399 | 4 | 740.290769 | 0.000405 | 0.30 | 0.000014 | 0.011885 | 61 | 0.538462 |
Main: additional stocks into df¶
Note: this section describes the additional feature / target order-book CSVs and how they were combined into df_combined.
Information about the contents of Optiver_additional_data.zip
"order_book_feature.csv" and "order_book_target.csv" contain information on order book snapshots, like the dataset we are using now. The parquet files store the same dataset indicated by the file name.
"stock_ids.csv" contains the names of stocks.
"time_id_reference.csv" contains the real-time of each time id.
"trades.csv" contains stock transaction information, including the stock price and size.
"train.csv" contains the "target", which is the actual volatility for a 30-minute window.
Some remarks:
Time id is sequential now and 1 time id represents 1 hour.
You need to combine two order book files together to do further study. With the same time id, the "feature" contains the information for the first 30 min (seconds_in_bucket from 0 - 1799), and the "target" contains the information for the last 30 min (seconds_in_bucket from 1800 - 3599).
"trades.csv" is arguably less useful for this project RE: calculating realized volatility from log-returns (the order book files would be preferrable here), but may be an interesting feature as it can indicate market liquidity.
The "target" in "train.csv" is the volatility for the last 30 minutes in one hour, so it can be calculated from the "order_book_target" file directly. You can ignore this file too.
# import pandas as pd
# stock_0 = pd.read_parquet("C:/Users/ryatu/Documents/optiver_data/individual_book_train/stock_0.parquet")
# # stock_0
# csv_folder = "C:/Users/ryatu/Documents/optiver_data/individual_book_train"
# parquet_folder = "C:/Users/ryatu/Documents/optiver_data/individual_book_train"
# for i in range(10): # adjust range if you have more files
# csv_file = os.path.join(csv_folder, f"stock_{i}.csv")
# parquet_file = os.path.join(parquet_folder, f"stock_{i}.parquet")
# if os.path.exists(csv_file):
# df = pd.read_csv(csv_file) # ← read the CSV first
# df.to_parquet(parquet_file) # ← save it as Parquet
# print(f"Converted: stock_{i}.csv → stock_{i}.parquet")
# else:
# print(f"File not found: stock_{i}.csv (skipped)")
# # features: Order book snapshots for the first 30 minutes of each 1-hour time window (seconds_in_bucket 0–1799).
# feature = pd.read_csv("C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/order_book_feature.csv", sep='\t')
# # features = pd.read_parquet(
# # r"C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/order_book_feature.parquet"
# # )
# # filter_feature = feature[1790:1800]
# # filter_feature
# # targets: Order book snapshots for the last 30 minutes of each 1-hour time window (seconds_in_bucket 1800–3599).
# target = pd.read_csv("C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/order_book_target.csv", sep='\t')
# # target = pd.read_parquet(
# # r"C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/order_book_target.parquet"
# # )
# # Stack them: feature first, then target
# df_combined = pd.concat([feature, target], ignore_index=True)
# # Optional: Sort if you want by stock_id, time_id, and seconds_in_bucket
# df_combined = df_combined.sort_values(by=['stock_id', 'time_id', 'seconds_in_bucket']).reset_index(drop=True)
# ## join/merge on the shared identifiers (stock_id, time_id); 'seconds_in_bucket' determines the within-hour sequence
Additional information files: load into df, helper functions to add features¶
def compute_wap_and_spread(df):
# Compute WAP (Weighted Average Price)
denom = df["bid_size1"] + df["ask_size1"]
df["wap"] = np.where(denom != 0,
(df["bid_price1"] * df["ask_size1"] + df["ask_price1"] * df["bid_size1"]) / denom,
np.nan)
# Calculate Bid-Ask Spread
df["BidAskSpread"] = np.where(df["bid_price1"] != 0,
df["ask_price1"] / df["bid_price1"] - 1,
np.nan)
# Calculate the spread between ask and bid prices
df["spread"] = df["ask_price1"] - df["bid_price1"]
# Handle NaN values in 'wap', 'BidAskSpread', and 'spread'
df['wap'] = df['wap'].ffill().bfill() # Fill forward, then backward
df['BidAskSpread'] = df['BidAskSpread'].fillna(0) # Fill with 0 as no spread can exist
df['spread'] = df['spread'].fillna(0) # Same for spread
return df
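# For reference, the level-1 quantities computed by compute_wap_and_spread above are:
#     wap          = (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)
#     BidAskSpread = ask_price1 / bid_price1 - 1        (relative spread)
#     spread       = ask_price1 - bid_price1            (absolute spread)
# WAP is forward/backward filled and the two spread columns are set to 0 where they cannot be computed.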
def compute_log_return(df):
print(" 1/3 sorting data by tock id , time_id, seconds_in_bucket")
df = df.sort_values(['stock_id', 'time_id', 'seconds_in_bucket'])
print(" 2/3 computing log returns grouped by stock_id and time_id")
df['log_return'] = (
df.groupby(['stock_id', 'time_id'])['wap']
.transform(lambda x: np.log(x / x.shift(1)))
)
print("[3/3] Filling NaNs in log_return with 0...")
df['log_return'] = df['log_return'].fillna(0)
return df
def assign_time_buckets(df):
df['time_bucket'] = np.ceil(df['seconds_in_bucket'] / 30).astype(int)
return df
def compute_realized_volatility(df):
print("[1/5] Checking necessary columns...")
required_cols = ['stock_id', 'time_id', 'seconds_in_bucket', 'log_return']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Missing columns: {missing_cols}")
print("[2/5] Sorting by stock_id, time_id, seconds_in_bucket...")
df = df.sort_values(['stock_id', 'time_id', 'seconds_in_bucket'], kind='mergesort')
print("[3/5] Computing realized volatility per group (vectorized)...")
vol = (
df.groupby(["stock_id", "time_id"])['log_return']
.agg(lambda x: np.sqrt((x**2).sum()))
.reset_index()
.rename(columns={"log_return": "realized_volatility"})
)
print("[4/5] Merging realized volatility back to original df...")
df = df.merge(vol, on=["stock_id", "time_id"], how="left")
print("✅ Done computing realized volatility!")
return df
def compute_imbalance(df):
total_volume = df['bid_size1'] + df['ask_size1']
df['imbalance'] = (df['bid_size1'] - df['ask_size1']) / total_volume.replace(0, np.nan)
return df
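# In the same notation, the helpers above compute, within each (stock_id, time_id) window:
#     log_return_t          = ln( wap_t / wap_{t-1} )
#     realized_volatility   = sqrt( sum_t log_return_t**2 )      (sum of squared one-second returns, no annualization)
#     imbalance_t           = (bid_size1 - ask_size1) / (bid_size1 + ask_size1)
# and assign_time_buckets bins each second into 30-second buckets via ceil(seconds_in_bucket / 30).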
# def global_seconds(df):
# time_id_mapping = {tid: i for i, tid in enumerate(sorted(df['time_id'].unique()))}
# df['global_seconds'] = df['seconds_in_bucket'] + df['time_id'].map(time_id_mapping) * 600
# return df
# def fill_missing_seconds(df):
# print("[1/5] Loading data with Dask to avoid memory issues...")
# # Converting the pandas DataFrame to Dask DataFrame to handle large datasets
# df = dd.from_pandas(df, npartitions=8) # Adjust the number of partitions based on your system
# print("[2/5] Preparing static seconds template...")
# # Create a static template for 'seconds_in_bucket'
# seconds_template = pd.DataFrame({'seconds_in_bucket': np.arange(600)})
# seconds_template = dd.from_pandas(seconds_template, npartitions=1) # Convert seconds template to Dask
# print("[3/5] Grouping by stock_id and time_id...")
# grouped = df.groupby(['stock_id', 'time_id'], sort=False) # Group by stock_id and time_id without sorting
# filled_list = []
# # Identify only the columns that need forward filling (exclude ID columns)
# ffill_cols = [col for col in df.columns if col not in ['stock_id', 'time_id', 'seconds_in_bucket']]
# print("[4/5] Filling missing seconds in groups...")
# # Use tqdm to iterate over the groups and track progress
# for stock_id, time_id, group in tqdm(grouped, desc="Filling seconds"):
# # Processing each group and filling seconds
# group = group.set_index('seconds_in_bucket')
# full_group = seconds_template.join(group, how='left')
# full_group['stock_id'] = stock_id
# full_group['time_id'] = time_id
# # Forward fill the trading features
# if ffill_cols:
# full_group[ffill_cols] = full_group[ffill_cols].ffill()
# # Drop rows where all feature columns are NaN
# full_group = full_group.dropna(how='all', subset=ffill_cols)
# # Convert Dask DataFrame back to Pandas for further concatenation
# filled_list.append(full_group.compute())
# # Concatenate the filled groups into the final DataFrame
# df_filled = pd.concat(filled_list, ignore_index=True)
# print("✅ Done filling missing seconds!")
# return df_filled
def process_stock_dataframe_fast(df, stock_id=None):
try:
if stock_id is not None:
df['stock_id'] = stock_id
# # Step 1: Compute log return FIRST
# df = compute_wap_and_spread(df)
# df = compute_log_return(df)
# df = compute_realized_volatility(df)
# df = assign_time_buckets(df)
# df = compute_wap_and_spread(df)
# df = compute_imbalance(df)
# Step 2: Check that 'log_return' already exists in the input (it was computed when df_combined was built)
if 'log_return' not in df.columns:
print(f"Skipping stock {stock_id}: 'log_return' column missing after computation")
return None
# Drop rows where 'log_return' is missing
df = df.dropna(subset=["log_return"])
# If after dropping, df is empty, skip
if df.empty:
print(f"Skipping stock {stock_id}: no valid log_return data")
return None
# Step 3: Compute simple summary statistics
avg_vol = df['realized_volatility'].mean()
std_vol = df['realized_volatility'].std()
avg_spread = df['spread'].mean()
avg_imbalance = df['imbalance'].mean()
return {
'stock_id': df['stock_id'].iloc[0],
'avg_vol': avg_vol,
'std_vol': std_vol,
'avg_spread': avg_spread,
'avg_imbalance': avg_imbalance
}
except Exception as e:
print(f"Error processing stock {stock_id}: {e}")
return None
## Per-stock summary: average realized volatility, its std, average bid-ask spread, and average imbalance ((bid_size1 - ask_size1) / (bid_size1 + ask_size1))
results = []
for stock_id, group_df in df_combined.groupby('stock_id'):
res = process_stock_dataframe_fast(group_df, stock_id=stock_id)
if res is not None:
results.append(res)
summary_df = pd.DataFrame(results)
print(summary_df.head())
   stock_id   avg_vol   std_vol  avg_spread  avg_imbalance
0      8382  0.010143  0.004906    0.189279      -0.024330
1      9323  0.004152  0.002040    0.011223       0.023677
2     22675  0.007071  0.002657    1.038275      -0.015388
3     22729  0.008430  0.003536    1.104877       0.014373
4     22753  0.004398  0.001698    0.018901       0.047418
summary_df
| | stock_id | avg_vol | std_vol | avg_spread | avg_imbalance |
|---|---|---|---|---|---|
| 0 | 8382 | 0.010143 | 0.004906 | 0.189279 | -0.024330 |
| 1 | 9323 | 0.004152 | 0.002040 | 0.011223 | 0.023677 |
| 2 | 22675 | 0.007071 | 0.002657 | 1.038275 | -0.015388 |
| 3 | 22729 | 0.008430 | 0.003536 | 1.104877 | 0.014373 |
| 4 | 22753 | 0.004398 | 0.001698 | 0.018901 | 0.047418 |
| 5 | 22771 | 0.007824 | 0.003216 | 0.206130 | 0.019420 |
| 6 | 22951 | 0.005721 | 0.002309 | 0.053346 | 0.023017 |
| 7 | 48219 | 0.008789 | 0.003510 | 0.164902 | 0.022281 |
| 8 | 50200 | 0.002054 | 0.001168 | 0.012243 | -0.013586 |
| 9 | 104919 | 0.002951 | 0.001572 | 0.013816 | -0.023676 |
1. Compute log_return, mean, std: core features (growth + volatility)¶
print(df_combined.columns)
Index(['stock_id', 'time_id', 'seconds_in_bucket', 'bid_price1', 'ask_price1',
'bid_price2', 'ask_price2', 'bid_size1', 'ask_size1', 'bid_size2',
'ask_size2', 'wap', 'BidAskSpread', 'spread', 'log_return',
'realized_volatility', 'time_bucket', 'imbalance'],
dtype='object')
# Inspect the first few rows
print(df_combined.head())
   stock_id  time_id  seconds_in_bucket  bid_price1  ask_price1  bid_price2  \
0      8382        6             1800.0      740.03      740.29       740.0
1      8382        6             1800.0      740.03      740.29       740.0
2      8382        6             1801.0      740.05      740.29       740.0
3      8382        6             1801.0      740.05      740.29       740.0
4      8382        6             1802.0      740.06      740.36       740.0

   ask_price2  bid_size1  ask_size1  bid_size2  ask_size2         wap  \
0      740.30          6          6        800         40  740.160000
1      740.30          6          6        800         40  740.160000
2      740.30         25          1         99         40  740.280769
3      740.30         25          1         99         40  740.280769
4      740.39        100         30        399          4  740.290769

   BidAskSpread  spread  log_return  realized_volatility  time_bucket  \
0      0.000351    0.26    0.000000             0.011885           60
1      0.000351    0.26    0.000000             0.011885           60
2      0.000324    0.24    0.000163             0.011885           61
3      0.000324    0.24    0.000000             0.011885           61
4      0.000405    0.30    0.000014             0.011885           61

   imbalance
0   0.000000
1   0.000000
2   0.923077
3   0.923077
4   0.538462
# Summary statistics for numerical columns
print(df_combined.describe())
stock_id time_id seconds_in_bucket bid_price1 \
count 3.555745e+07 3.555745e+07 3.555745e+07 3.555745e+07
mean 3.427864e+04 5.989761e+02 1.802904e+03 7.893693e+02
std 2.892329e+04 3.448613e+02 1.041338e+03 9.837921e+02
min 8.382000e+03 6.000000e+00 0.000000e+00 1.162100e+02
25% 2.267500e+04 3.010000e+02 9.010000e+02 2.759000e+02
50% 2.275300e+04 5.890000e+02 1.812000e+03 3.786500e+02
75% 4.821900e+04 8.980000e+02 2.705000e+03 6.523300e+02
max 1.049190e+05 1.199000e+03 3.599000e+03 3.772150e+03
ask_price1 bid_price2 ask_price2 bid_size1 ask_size1 \
count 3.555745e+07 3.555745e+07 3.555745e+07 3.555745e+07 3.555745e+07
mean 7.896112e+02 7.893253e+02 7.896546e+02 3.362972e+02 3.428978e+02
std 9.841550e+02 9.837394e+02 9.842071e+02 8.818183e+02 1.537778e+03
min 1.162200e+02 1.162000e+02 1.162300e+02 1.000000e+00 1.000000e+00
25% 2.759900e+02 2.758600e+02 2.760200e+02 2.500000e+01 2.500000e+01
50% 3.787000e+02 3.786200e+02 3.787200e+02 1.040000e+02 1.010000e+02
75% 6.525400e+02 6.523000e+02 6.525800e+02 4.000000e+02 4.000000e+02
max 3.773200e+03 3.772040e+03 3.773360e+03 6.055080e+05 5.052590e+05
bid_size2 ask_size2 wap BidAskSpread spread \
count 3.555745e+07 3.555745e+07 3.555745e+07 3.555745e+07 3.555745e+07
mean 5.322832e+02 5.438523e+02 7.894899e+02 2.109644e-04 2.418595e-01
std 1.170299e+03 1.990312e+03 9.839723e+02 1.998765e-04 4.230123e-01
min 1.000000e+00 1.000000e+00 1.162177e+02 2.894448e-06 1.000000e-02
25% 2.500000e+01 2.500000e+01 2.759501e+02 6.379992e-05 1.000000e-02
50% 1.250000e+02 1.210000e+02 3.786750e+02 1.478962e-04 4.000000e-02
75% 8.740000e+02 8.870000e+02 6.524312e+02 3.053901e-04 2.100000e-01
max 6.791250e+05 1.049878e+06 3.772909e+03 6.343757e-03 6.420000e+00
log_return realized_volatility time_bucket imbalance
count 3.555745e+07 3.555745e+07 3.555745e+07 3.555745e+07
mean 1.929972e-09 5.854340e-03 6.058256e+01 6.115131e-03
std 1.278304e-04 3.828251e-03 3.471034e+01 6.001824e-01
min -5.580854e-03 5.202128e-04 0.000000e+00 -9.999874e-01
25% -2.717131e-05 3.117310e-03 3.100000e+01 -5.006242e-01
50% 0.000000e+00 5.105796e-03 6.100000e+01 0.000000e+00
75% 2.791742e-05 7.643046e-03 9.100000e+01 5.191763e-01
max 1.107354e-02 4.379540e-02 1.200000e+02 9.999869e-01
trades = pd.read_csv("C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/trades.csv", sep='\t')
# This is optional: Trade-level data including prices and volumes. Useful for liquidity or additional features, but not necessary for computing realized volatility.
trades
# Actual executed trades: price, size, order count
# Analyze liquidity, volume spikes, microstructure noise
| | time_id | stock_id | seconds_in_bucket | price | size | order_count |
|---|---|---|---|---|---|---|
| 0 | 12 | 8382 | 1.0 | 721.994534 | 8157.0 | 74.0 |
| 1 | 12 | 8382 | 2.0 | 722.612764 | 492.0 | 8.0 |
| 2 | 12 | 8382 | 3.0 | 722.380000 | 25.0 | 1.0 |
| 3 | 12 | 8382 | 4.0 | 722.717397 | 292.0 | 10.0 |
| 4 | 12 | 8382 | 6.0 | 722.521562 | 128.0 | 3.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 6853530 | 1199 | 104919 | 1792.0 | 364.150000 | 242.0 | 3.0 |
| 6853531 | 1199 | 104919 | 1794.0 | 364.130012 | 801.0 | 7.0 |
| 6853532 | 1199 | 104919 | 1795.0 | 364.131000 | 1000.0 | 3.0 |
| 6853533 | 1199 | 104919 | 1796.0 | 364.139231 | 1300.0 | 5.0 |
| 6853534 | 1199 | 104919 | 1798.0 | 364.130000 | 1000.0 | 3.0 |
6853535 rows × 6 columns
Realized Volatility Estimation
- Compute RV and compare the computed volatility against the train.csv target volatility
- Look for systematic biases (e.g., is our volatility over- or under-estimating?)
print(trades[trades['stock_id'] == 8382])
         time_id  stock_id  seconds_in_bucket       price    size  order_count
0             12      8382                1.0  721.994534  8157.0         74.0
1             12      8382                2.0  722.612764   492.0          8.0
2             12      8382                3.0  722.380000    25.0          1.0
3             12      8382                4.0  722.717397   292.0         10.0
4             12      8382                6.0  722.521562   128.0          3.0
...          ...       ...                ...         ...     ...          ...
1021822     1199      8382             1790.0  796.559709   103.0          3.0
1021823     1199      8382             1792.0  796.713556    90.0          3.0
1021824     1199      8382             1793.0  796.688503   147.0          2.0
1021825     1199      8382             1794.0  796.670000   189.0          2.0
1021826     1199      8382             1799.0  796.730000    20.0          1.0

[1021827 rows x 6 columns]
trades['stock_id'].unique()
array([ 8382, 9323, 22675, 22729, 22753, 22771, 22951, 48219,
50200, 104919], dtype=int64)
print(trades['stock_id'].unique())
[ 8382 9323 22675 22729 22753 22771 22951 48219 50200 104919]
time_id_reference = pd.read_csv("C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/time_id_reference.csv")
# Maps time_id to actual timestamps.
# chrono order in time across all stocks ; is there a correlation between daytime performance v nighttime
time_id_reference
| | date | time | time_id |
|---|---|---|---|
| 0 | 2021-01-05 | 11:00:00 | 12 |
| 1 | 2021-01-05 | 12:00:00 | 13 |
| 2 | 2021-01-05 | 13:00:00 | 14 |
| 3 | 2021-01-05 | 14:00:00 | 15 |
| 4 | 2021-01-05 | 15:00:00 | 16 |
| ... | ... | ... | ... |
| 1183 | 2021-10-07 | 12:00:00 | 1195 |
| 1184 | 2021-10-07 | 13:00:00 | 1196 |
| 1185 | 2021-10-07 | 14:00:00 | 1197 |
| 1186 | 2021-10-07 | 15:00:00 | 1198 |
| 1187 | 2021-10-07 | 16:00:00 | 1199 |
1188 rows × 3 columns
train = pd.read_csv("C:/Users/ryatu/Documents/optiver_data/Optiver_additional data/train.csv")
# Contains precomputed volatility targets for the last 30 minutes of each hour. Can be derived from order_book_target.
train
| | time_id | stock_id | target |
|---|---|---|---|
| 0 | 12 | 8382 | 0.008714 |
| 1 | 12 | 9323 | 0.004731 |
| 2 | 12 | 22675 | 0.007892 |
| 3 | 12 | 22729 | 0.007427 |
| 4 | 12 | 22753 | 0.005196 |
| ... | ... | ... | ... |
| 11515 | 1199 | 22771 | 0.004278 |
| 11516 | 1199 | 22951 | 0.003204 |
| 11517 | 1199 | 48219 | 0.005806 |
| 11518 | 1199 | 50200 | 0.002002 |
| 11519 | 1199 | 104919 | 0.002154 |
11520 rows × 3 columns
Step¶
1. Compute log_return, mean, std: core features (growth + volatility), all-method block
2. Compute realized volatility: compare to train.csv targets
3. Analyze trade liquidity: find periods of illiquidity / risk
4. Plot mean vs std per stock: find the best growth/risk tradeoffs
5. Check volatility by time of day: discover structural market patterns
6. Build early models: predict volatility from the early window (pending)
The provided target volatility (realized vol) lets us compare our method against the target values.
2. Compute realized volatility: compare to train.csv targets¶
- Benchmark our calculated volatility against the "truth": computed volatility vs the train.csv target
- Look for systematic biases: is our computed RV over- or under-estimated?
myRV = df_combined[['stock_id', 'time_id', 'realized_volatility']].drop_duplicates() # dropping dup = multiple seconds for each time id
# merge with train csv based on stock_id , time_id
compare = myRV.merge(train, on=['stock_id', 'time_id'], how='inner')
compare
| | stock_id | time_id | realized_volatility | target |
|---|---|---|---|---|
| 0 | 8382 | 12 | 0.014113 | 0.008714 |
| 1 | 8382 | 13 | 0.010415 | 0.006838 |
| 2 | 8382 | 14 | 0.008435 | 0.005098 |
| 3 | 8382 | 15 | 0.006624 | 0.004095 |
| 4 | 8382 | 16 | 0.006230 | 0.004646 |
| ... | ... | ... | ... | ... |
| 11515 | 104919 | 1195 | 0.001839 | 0.001092 |
| 11516 | 104919 | 1196 | 0.001595 | 0.001147 |
| 11517 | 104919 | 1197 | 0.001546 | 0.001124 |
| 11518 | 104919 | 1198 | 0.002143 | 0.001499 |
| 11519 | 104919 | 1199 | 0.002619 | 0.002154 |
11520 rows × 4 columns
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Add a column for display labels (shown on hover)
compare['label'] = compare.apply(
lambda row: f"Stock: {int(row['stock_id'])}<br>Time ID: {int(row['time_id'])}<br>Target: {row['target']:.5f}<br>Computed: {row['realized_volatility']:.5f}",
axis=1
)
# Create interactive scatter plot
fig = px.scatter(
compare,
x='target',
y='realized_volatility',
color='stock_id',
hover_name='label',
labels={
"target": "Target Volatility",
"realized_volatility": "Your Computed Volatility",
"stock_id": "Stock ID"
},
title=" Realized Volatility Comparison train['target'] v Compute_realized_volatility(df) (Hover for Details)"
)
# Add reference y = x line
fig.add_shape(
type="line",
x0=0, y0=0, x1=0.04, y1=0.04,
line=dict(color="red", dash="dash"),
)
fig.update_layout(
width=900,
height=600,
showlegend=False
)
fig.show()
# Error analysis
compare['error'] = compare['realized_volatility'] - compare['target']
print("Mean Error:", compare['error'].mean())
print("Mean Absolute Error:", compare['error'].abs().mean())
Mean Error: 0.001912057880135775
Mean Absolute Error: 0.001912057880135775
This scatter shows that our computed values track the target well: most dots lie close to the red y = x line.
However, the mean error is positive (and equals the mean absolute error, so essentially every window is overestimated). A consistent overestimate is expected here, because our realized_volatility is computed over the full hour (seconds 0-3599), whereas the train.csv target covers only the last 30 minutes.
A few high outliers might be stocks with large jumps or thin liquidity.
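To confirm that the window mismatch explains the bias, a quick check (sketched below, not run in this notebook) is to recompute RV on the last 30 minutes only (seconds_in_bucket >= 1800), which is the window the train.csv target actually measures, and redo the error analysis. The names rv_last30 and check are illustrative.

# Recompute realized volatility using only the last 30 minutes so it matches the target window
last30 = df_combined[df_combined['seconds_in_bucket'] >= 1800]
rv_last30 = (
    last30.groupby(['stock_id', 'time_id'])['log_return']
          .agg(lambda x: np.sqrt((x ** 2).sum()))
          .reset_index()
          .rename(columns={'log_return': 'rv_last30'})
)
check = rv_last30.merge(train, on=['stock_id', 'time_id'], how='inner')
check['error'] = check['rv_last30'] - check['target']
print("Mean Error (last 30 min only):", check['error'].mean())
print("Mean Absolute Error (last 30 min only):", check['error'].abs().mean())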
def growth_vs_volatility(df):
"""
Compute mean log return and realized volatility, then create an interactive Plotly scatter plot
to visualize Growth vs Volatility Tradeoff.
"""
print("[1/4] Checking if 'log_return' exists...")
if 'log_return' not in df.columns:
raise ValueError("You must compute 'log_return' first!")
print("[2/4] Grouping by stock_id and time_id to compute mean log return and realized volatility...")
# Mean log return
mean_return = (
df.groupby(['stock_id', 'time_id'])['log_return']
.mean()
.reset_index()
.rename(columns={'log_return': 'mean_log_return'})
)
# Realized volatility (already computed, else recompute here)
if 'realized_volatility' not in df.columns:
print("Realized volatility not found, computing now...")
realized_vol = (
df.groupby(['stock_id', 'time_id'])['log_return']
.agg(lambda x: (x**2).sum()**0.5)
.reset_index()
.rename(columns={'log_return': 'realized_volatility'})
)
else:
realized_vol = df[['stock_id', 'time_id', 'realized_volatility']].drop_duplicates()
# Merge
tradeoff_df = pd.merge(mean_return, realized_vol, on=['stock_id', 'time_id'], how='inner')
print("[3/4] Creating hover labels...")
tradeoff_df['label'] = tradeoff_df.apply(
lambda row: f"Stock: {int(row['stock_id'])}<br>Time ID: {int(row['time_id'])}"
f"<br>Mean Log Return: {row['mean_log_return']:.6f}"
f"<br>Realized Volatility: {row['realized_volatility']:.6f}",
axis=1
)
print("[4/4] Plotting interactive scatter plot...")
fig = px.scatter(
tradeoff_df,
x='realized_volatility',
y='mean_log_return',
color='stock_id',
hover_name='label',
labels={
"realized_volatility": "Volatility (Std Dev of Log Returns)",
"mean_log_return": "Mean Log Return (Expected Growth)"
},
title="Growth vs. Volatility Tradeoff by Stock and Time Window"
)
fig.update_layout(
width=950,
height=650,
showlegend=False
)
fig.show()
return tradeoff_df
log_return_stats = growth_vs_volatility(df_combined)
[1/4] Checking if 'log_return' exists...
[2/4] Grouping by stock_id and time_id to compute mean log return and realized volatility...
[3/4] Creating hover labels...
[4/4] Plotting interactive scatter plot...
# X-axis: Volatility (Standard deviation of log returns)
# Y-axis: Mean log return — expected growth
# Color (stock_id): Differentiates stocks using a gradient (yellow = higher stock_id, dark = lower)
Most data points cluster around low volatility and near-zero return (expected in high-frequency finance).
Some outliers exist with:
- very high volatility and either very low or mildly high return
- a tiny handful with slightly positive return and low volatility (these are gold!)
# See the actual distribution first
print(log_return_stats['mean_log_return'].describe())
print(log_return_stats['realized_volatility'].describe())
count    1.158000e+04
mean     1.316352e-08
std      1.602670e-06
min     -1.109434e-05
25%     -7.493441e-07
50%      2.790223e-08
75%      7.702920e-07
max      1.992465e-05
Name: mean_log_return, dtype: float64

count    11580.000000
mean         0.006015
std          0.003741
min          0.000520
25%          0.003372
50%          0.005364
75%          0.007770
max          0.043795
Name: realized_volatility, dtype: float64
golden_points = log_return_stats[
(log_return_stats['mean_log_return'] > log_return_stats['mean_log_return'].quantile(0.95)) &
(log_return_stats['realized_volatility'] < log_return_stats['realized_volatility'].quantile(0.25))
]
print("Top candidates:")
print(golden_points.sort_values(by='mean_log_return', ascending=False).head(10))
Top candidates:
stock_id time_id mean_log_return realized_volatility \
9611 50200 365 0.000003 0.00315
label
9611 Stock: 50200<br>Time ID: 365<br>Mean Log Retur...
import plotly.graph_objects as go
fig = px.scatter(
log_return_stats,
x='realized_volatility',
y='mean_log_return',
color='stock_id',
hover_data=['stock_id', 'time_id', 'mean_log_return', 'realized_volatility'],
title="Growth vs. Volatility Tradeoff by Stock and Time Window"
)
# Overlay golden points
fig.add_trace(go.Scatter(
x=golden_points['realized_volatility'],
y=golden_points['mean_log_return'],
mode='markers',
marker=dict(color='limegreen', size=50, symbol='star'),
name='Best Tradeoffs',
text=golden_points['label'],
hoverinfo='text'
))
fig.update_layout(width=950, height=650)
fig.show()
This is a top candidate for a potentially valuable trading interval:
Positive expected growth (compounded return over the hour)
Lower-than-average volatility (less risk)
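One simple way to rank all windows on this growth/risk tradeoff (purely illustrative; the 'tradeoff_score' name is introduced here and is not used elsewhere in this notebook) is the ratio of mean log return to realized volatility:

# Illustrative ranking of (stock_id, time_id) windows by return per unit of risk
ranked = log_return_stats.copy()
ranked['tradeoff_score'] = ranked['mean_log_return'] / ranked['realized_volatility']
print(ranked.sort_values('tradeoff_score', ascending=False)
            [['stock_id', 'time_id', 'mean_log_return', 'realized_volatility', 'tradeoff_score']]
            .head(10))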
df_filtered = df_combined[(df_combined['stock_id'] == 50200) & (df_combined['time_id'] == 365)]
df_filtered
| | stock_id | time_id | seconds_in_bucket | bid_price1 | ask_price1 | bid_price2 | ask_price2 | bid_size1 | ask_size1 | bid_size2 | ask_size2 | wap | BidAskSpread | spread | log_return | realized_volatility | time_bucket | imbalance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 28472547 | 50200 | 365 | 0.0 | 392.24 | 392.25 | 392.23 | 392.26 | 1300 | 200 | 500 | 601 | 392.248667 | 0.000025 | 0.01 | 0.000000 | 0.00315 | 0 | 0.733333 |
| 28472548 | 50200 | 365 | 1.0 | 392.22 | 392.23 | 392.21 | 392.24 | 300 | 128 | 1400 | 600 | 392.227009 | 0.000025 | 0.01 | -0.000055 | 0.00315 | 1 | 0.401869 |
| 28472549 | 50200 | 365 | 2.0 | 392.22 | 392.23 | 392.21 | 392.24 | 400 | 328 | 1500 | 500 | 392.225495 | 0.000025 | 0.01 | -0.000004 | 0.00315 | 1 | 0.098901 |
| 28472550 | 50200 | 365 | 3.0 | 392.22 | 392.23 | 392.21 | 392.24 | 100 | 400 | 1500 | 500 | 392.222000 | 0.000025 | 0.01 | -0.000009 | 0.00315 | 1 | -0.600000 |
| 28472551 | 50200 | 365 | 4.0 | 392.25 | 392.26 | 392.24 | 392.27 | 300 | 500 | 1400 | 300 | 392.253750 | 0.000025 | 0.01 | 0.000081 | 0.00315 | 1 | -0.250000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28476142 | 50200 | 365 | 3595.0 | 396.20 | 396.22 | 396.19 | 396.23 | 800 | 200 | 1700 | 1400 | 396.216000 | 0.000050 | 0.02 | -0.000460 | 0.00315 | 120 | 0.600000 |
| 28476143 | 50200 | 365 | 3596.0 | 396.11 | 396.12 | 396.10 | 396.13 | 300 | 1400 | 800 | 1900 | 396.111765 | 0.000025 | 0.01 | -0.000263 | 0.00315 | 120 | -0.647059 |
| 28476144 | 50200 | 365 | 3597.0 | 396.12 | 396.13 | 396.11 | 396.14 | 300 | 1200 | 700 | 700 | 396.122000 | 0.000025 | 0.01 | 0.000026 | 0.00315 | 120 | -0.600000 |
| 28476145 | 50200 | 365 | 3598.0 | 395.93 | 395.94 | 395.92 | 395.95 | 100 | 700 | 600 | 500 | 395.931250 | 0.000025 | 0.01 | -0.000482 | 0.00315 | 120 | -0.750000 |
| 28476146 | 50200 | 365 | 3599.0 | 395.84 | 395.85 | 395.83 | 395.86 | 500 | 1200 | 2763 | 3000 | 395.842941 | 0.000025 | 0.01 | -0.000223 | 0.00315 | 120 | -0.411765 |
3600 rows × 18 columns
import matplotlib.pyplot as plt
def plot_golden_window(df_filtered, stock_id, time_id):
fig, axs = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
fig.suptitle(f"Stock {stock_id} | Time ID {time_id} — Golden Window", fontsize=16, weight='bold')
# 1. WAP over time
axs[0].plot(df_filtered['seconds_in_bucket'], df_filtered['wap'], color='blue')
axs[0].set_ylabel("WAP Price")
axs[0].set_title("WAP Price Evolution")
# 2. Log return over time
axs[1].plot(df_filtered['seconds_in_bucket'], df_filtered['log_return'], color='orange')
axs[1].set_ylabel("Log Return")
axs[1].set_title("Log Return Dynamics")
# 3. Cumulative Volatility (Realized)
df_filtered['cum_sq_return'] = (df_filtered['log_return'] ** 2).cumsum()
df_filtered['realized_vol'] = df_filtered['cum_sq_return'] ** 0.5
axs[2].plot(df_filtered['seconds_in_bucket'], df_filtered['realized_vol'], color='green')
axs[2].set_ylabel("Realized Volatility")
axs[2].set_xlabel("Seconds in Bucket")
axs[2].set_title("Volatility Build-Up")
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
plot_golden_window(df_filtered, stock_id=50200, time_id=365)
C:\Users\ryatu\AppData\Local\Temp\ipykernel_27444\1832036949.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\ryatu\AppData\Local\Temp\ipykernel_27444\1832036949.py:20: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
3. Growth Rate vs Volatility Tradeoff¶
# 3. Liquidity and Noise Analysis (using trades.csv)
# Check:
# Order count per second/minute → proxy for liquidity
# Trade size distribution → bigger trades = more institutional interest
# Price movement vs volume spikes → does volatility correlate with trade bursts?
# Hypothesis: Periods of low liquidity may have higher realized volatility
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_liquidity_and_noise(trades_df, log_return_df, stock_id, time_id):
# 1. Filter data for the specific stock_id and time_id
tdf = trades_df[(trades_df['stock_id'] == stock_id) & (trades_df['time_id'] == time_id)].sort_values('seconds_in_bucket')
rdf = log_return_df[(log_return_df['stock_id'] == stock_id) & (log_return_df['time_id'] == time_id)].sort_values('seconds_in_bucket')
if tdf.empty:
print(f"No trades found for stock_id={stock_id}, time_id={time_id}.")
return
fig, axs = plt.subplots(4, 1, figsize=(14, 18), sharex=False)
fig.suptitle(f"Liquidity and Noise Analysis – Stock {stock_id}, Time ID {time_id}", fontsize=20, weight='bold')
# 2. Plot Order Count (Liquidity Proxy)
axs[0].plot(tdf['seconds_in_bucket'], tdf['order_count'], marker='o', linestyle='-', color='firebrick', alpha=0.7)
axs[0].set_ylabel("Order Count")
axs[0].set_title("Order Count Over Time")
axs[0].grid(True)
# 3. Trade Size Distribution (Histogram)
sns.histplot(tdf['size'], bins=40, kde=False, ax=axs[1], color='purple')
axs[1].set_title("Trade Size Distribution")
axs[1].set_xlabel("Trade Size")
axs[1].set_ylabel("Frequency")
axs[1].grid(True)
# 4. Trade Price vs Volume Spikes
axs[2].plot(tdf['seconds_in_bucket'], tdf['price'], color='black', label='Trade Price')
axs[2].scatter(tdf['seconds_in_bucket'], tdf['price'],
s=(tdf['size'] / tdf['size'].max()) * 300, # Normalize marker size nicely
alpha=0.5, color='dodgerblue', label='Trade Size (scaled)')
axs[2].legend()
axs[2].set_ylabel("Trade Price")
axs[2].set_title("Price Movement Vs Volume Spike")
axs[2].grid(True)
# 5. Rolling Volatility (from log returns)
window_size = 30 # seconds
rdf['rolling_vol'] = rdf['log_return'].rolling(window=window_size, min_periods=5).std()
axs[3].plot(rdf['seconds_in_bucket'], rdf['rolling_vol'], color='seagreen')
axs[3].set_ylabel("Rolling Volatility")
axs[3].set_xlabel("Seconds in Bucket")
axs[3].set_title(f"Rolling Volatility (Window = {window_size}s)")
axs[3].grid(True)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
analyze_liquidity_and_noise(trades, df_combined, stock_id=50200, time_id=365)
# golden point best peroformer (stock_id = 50200, time_id = 365) came from order_book.csv
# Low realized volatility
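The hypothesis above (low liquidity tends to go with higher realized volatility) can also be checked numerically across all windows, not just the golden one. A rough sketch follows, using per-window totals from trades.csv as the liquidity proxy; 'liq' and 'liq_vs_vol' are illustrative names, and negative Spearman correlations would support the hypothesis.

# Per-window liquidity proxies from trades.csv
liq = (
    trades.groupby(['stock_id', 'time_id'])
          .agg(total_orders=('order_count', 'sum'), total_size=('size', 'sum'))
          .reset_index()
)
# One realized volatility value per (stock_id, time_id) window
rv_per_window = df_combined[['stock_id', 'time_id', 'realized_volatility']].drop_duplicates()
liq_vs_vol = liq.merge(rv_per_window, on=['stock_id', 'time_id'], how='inner')
# Rank correlation between liquidity and volatility (fewer orders -> more volatility?)
print(liq_vs_vol[['total_orders', 'total_size', 'realized_volatility']].corr(method='spearman'))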
4. Microstructure Noise- Compare order book price movement vs trade price movement¶
# Compare order book price movement (trades) vs trade price movement (feature+ target)
# Order book shows quotes (intentions)
# Trades show executed prices (real transactions)
# Are there differences? (Bid-ask bounce effect?)
# Higher noise usually implies overestimated volatility at very short timeframes
def detect_trade_direction(trades_df, orderbook_df, stock_id, time_id):
# 1. Filter data
tdf = trades_df[(trades_df['stock_id'] == stock_id) & (trades_df['time_id'] == time_id)].sort_values('seconds_in_bucket')
odf = orderbook_df[(orderbook_df['stock_id'] == stock_id) & (orderbook_df['time_id'] == time_id)].sort_values('seconds_in_bucket')
if tdf.empty or odf.empty:
print(f"No matching trades or order book data for stock_id={stock_id}, time_id={time_id}.")
return
# 2. Merge trades with order book data to get bid/ask prices
merged = pd.merge_asof(
tdf[['seconds_in_bucket', 'price', 'size']],
odf[['seconds_in_bucket', 'bid_price1', 'ask_price1']],
on='seconds_in_bucket',
direction='nearest',
tolerance=1
).dropna()
# 3. Classify trade side
def classify_trade(row):
if abs(row['price'] - row['ask_price1']) <= abs(row['price'] - row['bid_price1']):
return 'Buy' # Closer to ask → Buyer aggressive
else:
return 'Sell' # Closer to bid → Seller aggressive
merged['trade_side'] = merged.apply(classify_trade, axis=1)
return merged # return full labeled trades
labeled_trades = detect_trade_direction(trades, df_combined, stock_id=50200, time_id=365)
print(labeled_trades[['seconds_in_bucket', 'price', 'bid_price1', 'ask_price1', 'trade_side']].head())
   seconds_in_bucket       price  bid_price1  ask_price1 trade_side
0                1.0  392.239910      392.22      392.23        Buy
1                2.0  392.199259      392.22      392.23       Sell
2                3.0  392.234673      392.22      392.23        Buy
3                4.0  392.240000      392.25      392.26       Sell
4                5.0  392.260000      392.25      392.26        Buy
def measure_microstructure_noise(labeled_trades):
# Compute mid-point
labeled_trades['midpoint'] = (labeled_trades['bid_price1'] + labeled_trades['ask_price1']) / 2
# Absolute deviation from fair price
labeled_trades['abs_deviation'] = (labeled_trades['price'] - labeled_trades['midpoint']).abs()
avg_noise = labeled_trades['abs_deviation'].mean()
max_noise = labeled_trades['abs_deviation'].max()
print(f"📈 Average Microstructure Noise (abs deviation): {avg_noise:.6f}")
print(f"🚀 Maximum Microstructure Noise (abs deviation): {max_noise:.6f}")
return labeled_trades
Input any stock and time id
# 1. Detect trade directions
labeled_trades = detect_trade_direction(trades, df_combined, stock_id=50200, time_id=365)
# 3. Measure noise
measure_microstructure_noise(labeled_trades)
📈 Average Microstructure Noise (abs deviation): 0.008878
🚀 Maximum Microstructure Noise (abs deviation): 0.050652
| | seconds_in_bucket | price | size | bid_price1 | ask_price1 | trade_side | midpoint | abs_deviation |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 392.239910 | 223.0 | 392.22 | 392.23 | Buy | 392.225 | 0.014910 |
| 1 | 2.0 | 392.199259 | 1296.0 | 392.22 | 392.23 | Sell | 392.225 | 0.025741 |
| 2 | 3.0 | 392.234673 | 428.0 | 392.22 | 392.23 | Buy | 392.225 | 0.009673 |
| 3 | 4.0 | 392.240000 | 101.0 | 392.25 | 392.26 | Sell | 392.255 | 0.015000 |
| 4 | 5.0 | 392.260000 | 200.0 | 392.25 | 392.26 | Buy | 392.255 | 0.005000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1585 | 1794.0 | 392.840000 | 400.0 | 392.83 | 392.84 | Buy | 392.835 | 0.005000 |
| 1586 | 1795.0 | 392.850000 | 2706.0 | 392.85 | 392.86 | Sell | 392.855 | 0.005000 |
| 1587 | 1796.0 | 392.860371 | 727.0 | 392.86 | 392.87 | Sell | 392.865 | 0.004629 |
| 1588 | 1797.0 | 392.863151 | 1606.0 | 392.84 | 392.85 | Buy | 392.845 | 0.018151 |
| 1589 | 1799.0 | 392.850000 | 805.0 | 392.84 | 392.85 | Buy | 392.845 | 0.005000 |
1590 rows × 8 columns
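The comments at the start of this section note that bid-ask bounce tends to inflate volatility at very short horizons. A quick, illustrative comparison for the same window is RV computed from executed trade prices versus RV computed from the order-book WAP, both restricted to the first 30 minutes (which is what trades.csv appears to cover). The variable names below are ad hoc.

# Stock 50200, time_id 365: RV from the quote midpoint path vs RV from trade prices
win = df_combined[(df_combined['stock_id'] == 50200) & (df_combined['time_id'] == 365)]
win = win[win['seconds_in_bucket'] < 1800]  # match the trades.csv window
wap_rv = np.sqrt((win['log_return'] ** 2).sum())

t = trades[(trades['stock_id'] == 50200) & (trades['time_id'] == 365)].sort_values('seconds_in_bucket')
trade_ret = np.log(t['price'] / t['price'].shift(1)).dropna()
trade_rv = np.sqrt((trade_ret ** 2).sum())

print(f"RV from order-book WAP (first 30 min): {wap_rv:.6f}")
print(f"RV from executed trade prices:         {trade_rv:.6f}  (often inflated by bid-ask bounce)")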
# Legend for the plot below:
# Green dots: Buy trades (aggressive buyers lifting the ask)
# Red dots: Sell trades (aggressive sellers hitting the bid)
# Gray dots: trades that could not be classified
# Marker area is scaled by trade size
def plot_trade_directions(labeled_trades, stock_id, time_id):
fig, ax = plt.subplots(figsize=(14, 7))
fig.suptitle(f"Trade Directions – Stock {stock_id}, Time ID {time_id}", fontsize=18, weight='bold')
colors = {'Buy': 'limegreen', 'Sell': 'red'}
# Plot Buy and Sell separately
for side, group in labeled_trades.groupby('trade_side'):
ax.scatter(
group['seconds_in_bucket'], group['price'],
s=(group['size'] / group['size'].max()) * 300,
color=colors.get(side, 'gray'),
label=f"{side} Trades",
alpha=0.6
)
ax.set_xlabel("Seconds in Bucket")
ax.set_ylabel("Trade Price")
ax.legend(title="Trade Side", fontsize=12, title_fontsize=14)
ax.grid(True)
plt.show()
Input any stock and time id
# First detect trade directions
labeled_trades = detect_trade_direction(trades, df_combined, stock_id=50200, time_id=365)
# Then plot trade directions
plot_trade_directions(labeled_trades, stock_id=50200, time_id=365)
5. Time of Day Effects (using time_id_reference.csv)¶
# 5. Time of Day Effects (using time_id_reference.csv)
# Map time_id to the actual datetime (via time_id_reference.csv)
# Investigate whether volatility changes by time of day (e.g., around the open or close)
# Merge this info with the volatility calculations (realized_volatility, mean_log_return, etc.)
# Morning and afternoon might have very different dynamics:
#   is volatility higher at certain times of day? does it spike or follow a pattern (GARCH-style persistence)?
# Merge it with your log_return_stats
log_return_stats = log_return_stats.merge(time_id_reference, on='time_id', how='left')
# Convert to datetime for easier handling
log_return_stats['datetime'] = pd.to_datetime(log_return_stats['date'] + ' ' + log_return_stats['time'])
# Extract hour from datetime
log_return_stats['hour'] = log_return_stats['datetime'].dt.hour
log_return_stats
| | stock_id | time_id | mean_log_return | realized_volatility | label | date | time | datetime | hour |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8382 | 6 | -3.316286e-06 | 0.011885 | Stock: 8382<br>Time ID: 6<br>Mean Log Return: ... | NaN | NaN | NaT | NaN |
| 1 | 8382 | 7 | -1.785381e-06 | 0.009352 | Stock: 8382<br>Time ID: 7<br>Mean Log Return: ... | NaN | NaN | NaT | NaN |
| 2 | 8382 | 8 | 4.414603e-08 | 0.009113 | Stock: 8382<br>Time ID: 8<br>Mean Log Return: ... | NaN | NaN | NaT | NaN |
| 3 | 8382 | 9 | 7.453871e-07 | 0.006929 | Stock: 8382<br>Time ID: 9<br>Mean Log Return: ... | NaN | NaN | NaT | NaN |
| 4 | 8382 | 10 | 9.269810e-07 | 0.007495 | Stock: 8382<br>Time ID: 10<br>Mean Log Return:... | NaN | NaN | NaT | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11575 | 104919 | 1195 | 3.046436e-07 | 0.001839 | Stock: 104919<br>Time ID: 1195<br>Mean Log Ret... | 2021-10-07 | 12:00:00 | 2021-10-07 12:00:00 | 12.0 |
| 11576 | 104919 | 1196 | -2.271572e-07 | 0.001595 | Stock: 104919<br>Time ID: 1196<br>Mean Log Ret... | 2021-10-07 | 13:00:00 | 2021-10-07 13:00:00 | 13.0 |
| 11577 | 104919 | 1197 | -4.365456e-07 | 0.001546 | Stock: 104919<br>Time ID: 1197<br>Mean Log Ret... | 2021-10-07 | 14:00:00 | 2021-10-07 14:00:00 | 14.0 |
| 11578 | 104919 | 1198 | -6.712164e-07 | 0.002143 | Stock: 104919<br>Time ID: 1198<br>Mean Log Ret... | 2021-10-07 | 15:00:00 | 2021-10-07 15:00:00 | 15.0 |
| 11579 | 104919 | 1199 | -8.380999e-07 | 0.002619 | Stock: 104919<br>Time ID: 1199<br>Mean Log Ret... | 2021-10-07 | 16:00:00 | 2021-10-07 16:00:00 | 16.0 |
11580 rows × 9 columns
# Group by hour and compute average realized volatility
hourly_volatility = log_return_stats.groupby('hour')['realized_volatility'].mean().reset_index()
Here we compute the average volatility for each hour of the day (11 AM, 12 PM, etc.), regardless of which stock_id the values come from.
hourly_volatility
| | hour | realized_volatility |
|---|---|---|
| 0 | 11.0 | 0.008465 |
| 1 | 12.0 | 0.006400 |
| 2 | 13.0 | 0.005394 |
| 3 | 14.0 | 0.005060 |
| 4 | 15.0 | 0.005257 |
| 5 | 16.0 | 0.005517 |
plt.figure(figsize=(12,6))
plt.plot(hourly_volatility['hour'], hourly_volatility['realized_volatility'], marker='o')
plt.title("Average Realized Volatility by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Average Realized Volatility")
plt.grid(True)
plt.xticks(range(9, 18)) # If your trading day is 9 AM to 5 PM
plt.show()
plt.figure(figsize=(12, 6))
sns.boxplot(data=log_return_stats, x='hour', y='realized_volatility')
plt.title("Realized Volatility Distribution by Hour of Day")
plt.xlabel("Hour")
plt.ylabel("Realized Volatility")
plt.grid(True)
plt.show()
log_return_stats['session'] = log_return_stats['hour'].apply(lambda h: np.nan if pd.isna(h) else ('Morning' if h < 13 else 'Afternoon'))  # rows with no time mapping stay NaN
sns.boxplot(data=log_return_stats, x='session', y='realized_volatility')
plt.title("Volatility: Morning vs Afternoon")
plt.grid(True)
plt.show()
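To go beyond the visual Morning vs Afternoon comparison, a simple non-parametric test can be run on the two groups. This is only a sketch using scipy's Mann-Whitney U test; it is not part of the original analysis above.

# Is morning realized volatility significantly higher than afternoon volatility?
from scipy.stats import mannwhitneyu

morning = log_return_stats.loc[log_return_stats['session'] == 'Morning', 'realized_volatility'].dropna()
afternoon = log_return_stats.loc[log_return_stats['session'] == 'Afternoon', 'realized_volatility'].dropna()
stat, p_value = mannwhitneyu(morning, afternoon, alternative='greater')
print(f"Mann-Whitney U = {stat:.1f}, p-value (Morning > Afternoon) = {p_value:.4f}")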
"best performeer"¶
hourly_by_stock = log_return_stats.groupby(['stock_id', 'hour'])['realized_volatility'].mean().reset_index()
stock_50200 = hourly_by_stock[hourly_by_stock['stock_id'] == 50200]
plt.figure(figsize=(12, 6))
plt.plot(stock_50200['hour'], stock_50200['realized_volatility'], marker='o')
plt.title("Volatility by Hour – Stock 50200")
plt.xlabel("Hour")
plt.ylabel("Realized Volatility")
plt.grid(True)
plt.show()
let's find the worst performer¶
# Ensure no NaN values in hour
log_return_stats_clean = log_return_stats.dropna(subset=['hour'])
# Sort by mean log return (lowest = worst)
worst = log_return_stats_clean.sort_values(by='mean_log_return').head(1)
print("📉 Worst Performing Time Window:")
print(worst[['stock_id', 'time_id', 'mean_log_return', 'realized_volatility', 'datetime', 'hour']])
📉 Worst Performing Time Window:
stock_id time_id mean_log_return realized_volatility \
248 8382 266 -0.000011 0.028858
datetime hour
248 2021-03-04 13:00:00 13.0
worst_10 = log_return_stats_clean.sort_values(by='mean_log_return').head(10)
print(worst_10[['stock_id', 'time_id', 'mean_log_return', 'realized_volatility', 'datetime', 'hour']])
stock_id time_id mean_log_return realized_volatility \
248 8382 266 -0.000011 0.028858
627 8382 657 -0.000011 0.015021
259 8382 277 -0.000010 0.023656
110 8382 122 -0.000010 0.023872
84 8382 96 -0.000010 0.016136
541 8382 565 -0.000009 0.017013
2197 9323 1081 -0.000009 0.014557
323 8382 341 -0.000009 0.013916
205 8382 223 -0.000009 0.024754
6588 22771 834 -0.000009 0.012693
datetime hour
248 2021-03-04 13:00:00 13.0
627 2021-06-03 14:00:00 14.0
259 2021-03-08 12:00:00 12.0
110 2021-01-29 13:00:00 13.0
84 2021-01-25 11:00:00 11.0
541 2021-05-13 12:00:00 12.0
2197 2021-09-10 12:00:00 12.0
323 2021-03-22 16:00:00 16.0
205 2021-02-23 12:00:00 12.0
6588 2021-07-15 11:00:00 11.0
hourly_by_stock = log_return_stats.groupby(['stock_id', 'hour'])['realized_volatility'].mean().reset_index()
stock_8382 = hourly_by_stock[hourly_by_stock['stock_id'] == 8382]
plt.figure(figsize=(12, 6))
plt.plot(stock_8382['hour'], stock_8382['realized_volatility'], marker='o')
plt.title("Volatility by Hour – Stock 8382 ")
plt.xlabel("Hour")
plt.ylabel("Realized Volatility")
plt.grid(True)
plt.show()
# ── metric functions ──
def rmse(y, yhat):
return np.sqrt(mean_squared_error(y, yhat))
def mape(y, yhat):
return np.mean(np.abs((y - yhat) / y)) * 100
def smape(y, yhat):
return np.mean(2 * np.abs(yhat - y) / (np.abs(y) + np.abs(yhat))) * 100
def qlike(y, yhat):
# guard against zero
mask = (yhat>0) & (y>0)
return np.mean(y[mask]/yhat[mask] - np.log(y[mask]/yhat[mask]) - 1)
METRICS = {
'R2': r2_score,
'MSE': mean_squared_error,
'RMSE': rmse,
'MedAE': median_absolute_error,
'MAPE': mape,
'sMAPE': smape,
'QLIKE': qlike,
}
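# For reference, the QLIKE loss implemented above is
#     QLIKE(y, yhat) = mean( y/yhat - log(y/yhat) - 1 )
# evaluated only over pairs with y > 0 and yhat > 0 (the mask in qlike); it is zero when
# yhat == y and penalizes under-prediction of volatility more heavily than over-prediction.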
FEATURES = [
'spread', 'imbalance', 'depth',
'depth_ratio', 'rv_rolling', 'mom_rolling'
]
# ── quiet down warnings ──
warnings.filterwarnings("ignore")
# ── where your splits live ──
BASE = Path(
r"C:/Users/ryatu/Documents/New folder/"
"Data3888_Optiver_9/FINAL_MODELS/splits"
)
DEFAULT_N_CLUSTERS = 3 # change this to whatever you prefer
def evaluate_stock(stock_dir: Path) -> pd.DataFrame:
rows = []
stock_id = stock_dir.name
for week_dir in sorted(stock_dir.glob("week_*")):
print(f" • Week {week_dir.name} …", end=" ")
train = pd.read_csv(week_dir / "train.csv")
test = pd.read_csv(week_dir / "test.csv")
# ── re-cluster on the fly ──
km = KMeans(n_clusters=DEFAULT_N_CLUSTERS, random_state=0)
train['cluster'] = km.fit_predict(train[FEATURES])
test['cluster'] = km.predict(test[FEATURES])
test['pred_vol'] = np.nan
# ── EGARCH(1,1,1) per cluster ──
for cid in sorted(train['cluster'].unique()):
sub = (
train[train['cluster']==cid]
.sort_values(['stock_id','time_id','bucket_start'])
)
rets = sub['log_ret_mean'].dropna().astype(float)
if len(rets) < 50:
continue
mu, sigma = rets.mean(), rets.std()
if sigma == 0 or not np.isfinite(sigma):
continue
rets_std = (rets - mu) / sigma
am = arch_model(
rets_std,
vol="EGARCH", p=1, o=1, q=1,
mean="Zero", dist="t", rescale=False
)
res = am.fit(disp="off", options={"solver":"powell"})
f = res.forecast(horizon=1, reindex=False)
vol_std = np.sqrt(f.variance.values[-1, 0]) * sigma
idx = test[test['cluster']==cid].index
test.loc[idx, 'pred_vol'] = vol_std
# ── compute metrics ──
valid = test[['realized_volatility','pred_vol']].dropna()
for name, fn in METRICS.items():
val = fn(valid['realized_volatility'], valid['pred_vol'])
rows.append({
'stock': stock_id,
'week': week_dir.name,
'metric': name,
'value': val
})
print("done")
return pd.DataFrame(rows)
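# Schematically, each cluster's model above is a zero-mean EGARCH(1,1,1) with Student-t
# innovations, fitted to returns standardized by the cluster mean and std (textbook form;
# arch's internal parameterization may differ in minor details):
#     ln(sigma_t^2) = omega + alpha * (|e_{t-1}| - E|e_{t-1}|) + gamma * e_{t-1} + beta * ln(sigma_{t-1}^2),
# where e_t is the standardized innovation and gamma captures the leverage/asymmetry effect (o=1).
# The one-step-ahead forecast sqrt(variance) is produced on the standardized scale and is
# multiplied back by the cluster's return std (sigma) before being written into pred_vol.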
for stock_dir in sorted(BASE.iterdir()):
if not stock_dir.is_dir():
continue
print(f"\n=== Evaluating stock {stock_dir.name} ===")
df = evaluate_stock(stock_dir)
out = stock_dir / "egarch_metrics.csv"
df.to_csv(out, index=False)
print(f" → Saved {len(df)} rows to {out}")
(Each ConvergenceWarning below was raised from C:\Users\ryatu\anaconda3\Lib\site-packages\arch\univariate\base.py:768; see scipy.optimize.fmin_slsqp for the return-code meaning.)

=== Evaluating stock 104919.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on weeks 3, 35, 37)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\104919.0\egarch_metrics.csv

=== Evaluating stock 22675.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on weeks 5, 14)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\22675.0\egarch_metrics.csv

=== Evaluating stock 22729.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on weeks 7, 12, 24, 25)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\22729.0\egarch_metrics.csv

=== Evaluating stock 22753.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on weeks 29, 31)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\22753.0\egarch_metrics.csv

=== Evaluating stock 22771.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on weeks 12, 32, 36)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\22771.0\egarch_metrics.csv

=== Evaluating stock 22951.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Inequality constraints incompatible" on week 12; "Iteration limit reached" on week 24)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\22951.0\egarch_metrics.csv

=== Evaluating stock 48219.0 ===
• Weeks week_1 … week_38 done (no convergence warnings)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\48219.0\egarch_metrics.csv

=== Evaluating stock 50200.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on weeks 11, 13, 18, 25, 30, 35, 36)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\50200.0\egarch_metrics.csv

=== Evaluating stock 8382.0 ===
• Weeks week_1 … week_38 done (ConvergenceWarning "Iteration limit reached" on week 34)
→ Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\8382.0\egarch_metrics.csv

=== Evaluating stock 9323.0 ===
• Weeks week_1, week_10 … week_17 done • Week week_18 …
C:\Users\ryatu\anaconda3\Lib\site-packages\arch\univariate\base.py:768: ConvergenceWarning: The optimizer returned code 9. The message is: Iteration limit reached See scipy.optimize.fmin_slsqp for code meaning. warnings.warn(
done doneWeek week_19 … doneWeek week_2 … doneWeek week_20 … doneWeek week_21 … doneWeek week_22 … doneWeek week_23 … doneWeek week_24 … doneWeek week_25 … doneWeek week_26 … doneWeek week_27 … doneWeek week_28 … doneWeek week_29 … doneWeek week_3 … doneWeek week_30 … doneWeek week_31 … doneWeek week_32 … doneWeek week_33 … doneWeek week_34 … doneWeek week_35 … doneWeek week_36 … doneWeek week_37 … doneWeek week_38 … doneWeek week_4 … doneWeek week_5 … doneWeek week_6 … doneWeek week_7 … doneWeek week_8 … doneWeek week_9 … → Saved 266 rows to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\9323.0\egarch_metrics.csv
base_dir = Path(r"C:/Users/ryatu/Documents/New folder/Data3888_Optiver_9/FINAL_MODELS/splits")

# Collect every per-stock metrics file written by the evaluation loop above
files = list(base_dir.glob("*/egarch_metrics.csv"))
print(f"Found {len(files)} files:")
for f in files:
    print(" •", f.name, "in", f.parent.name)

# Stack them into a single long-format table and save alongside the splits
dfs = [pd.read_csv(f) for f in files]
combined = pd.concat(dfs, ignore_index=True)
out_path = base_dir / "allstocks_egarch_metrics.csv"
combined.to_csv(out_path, index=False)
print(f"\nCombined {len(dfs)} files into {len(combined)} rows.")
print(f"Saved combined metrics to {out_path}")
Found 10 files:
 • egarch_metrics.csv in 104919.0
 • egarch_metrics.csv in 22675.0
 • egarch_metrics.csv in 22729.0
 • egarch_metrics.csv in 22753.0
 • egarch_metrics.csv in 22771.0
 • egarch_metrics.csv in 22951.0
 • egarch_metrics.csv in 48219.0
 • egarch_metrics.csv in 50200.0
 • egarch_metrics.csv in 8382.0
 • egarch_metrics.csv in 9323.0

Combined 10 files into 2660 rows.
Saved combined metrics to C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\allstocks_egarch_metrics.csv
# 1) Load the combined long-format metrics
result = pd.read_csv(
    r"C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\allstocks_egarch_metrics.csv"
)

# 2) Extract a numeric week index so week_2 sorts before week_10
result['week_num'] = result['week'].str.extract(r'week_(\d+)').astype(int)

# 3) Fix the display order of the metrics
metric_order = ['R2','MSE','RMSE','MedAE','MAPE','sMAPE','QLIKE']
result['metric'] = pd.Categorical(
    result['metric'],
    categories=metric_order,
    ordered=True
)

# 4) Sort by stock, then week, then metric
result = result.sort_values(
    ['stock','week_num','metric']
).reset_index(drop=True)

# 5) Drop the helper column
result = result.drop(columns=['week_num'])

# 6) Save out
result.to_csv(
    r"C:\Users\ryatu\Documents\New folder\Data3888_Optiver_9\FINAL_MODELS\splits\allstocks_egarch_metrics_sorted.csv",
    index=False
)

# 7) Quick sanity check
print(result.head(14))
     stock    week metric         value
0   8382.0  week_1     R2 -4.949554e+00
1   8382.0  week_1    MSE  1.257301e-06
2   8382.0  week_1   RMSE  1.121294e-03
3   8382.0  week_1  MedAE  9.385326e-04
4   8382.0  week_1   MAPE  9.588906e+01
5   8382.0  week_1  sMAPE  1.843218e+02
6   8382.0  week_1  QLIKE  2.487207e+01
7   8382.0  week_2     R2 -3.704938e+00
8   8382.0  week_2    MSE  3.308309e-07
9   8382.0  week_2   RMSE  5.751790e-04
10  8382.0  week_2  MedAE  4.431064e-04
11  8382.0  week_2   MAPE  9.493274e+01
12  8382.0  week_2  sMAPE  1.809359e+02
13  8382.0  week_2  QLIKE  2.080013e+01
result.head()
| stock | week | metric | value | |
|---|---|---|---|---|
| 0 | 8382.0 | week_1 | R2 | -4.949554 |
| 1 | 8382.0 | week_1 | MSE | 0.000001 |
| 2 | 8382.0 | week_1 | RMSE | 0.001121 |
| 3 | 8382.0 | week_1 | MedAE | 0.000939 |
| 4 | 8382.0 | week_1 | MAPE | 95.889059 |
A few gotchas to keep in mind (a minimal sketch illustrating the first three follows this list):

Zero-volatility bins: if the 30-minute realized volatility (realized_volatility) ever hits zero, MAPE/sMAPE and QLIKE blow up (division by zero / −log(0)). Filter those buckets out or clamp them with a tiny epsilon before computing the metrics.

Static vs. rolling forecast: the loop currently fits once on the entire training cluster and assigns that single one-step forecast to every test row in the same cluster. For a genuinely "streaming" prediction, the EGARCH model needs to be re-fit (or updated) on a rolling window and forecast again at each 30-minute boundary.

Standardization leakage: the sigma used to back-scale the forecast must be computed on the training slice only; never let any test rows enter that standard deviation.

Solver settings: the ConvergenceWarnings above come from SciPy's SLSQP hitting its iteration cap. Raising the iteration budget via fit(options={'maxiter': ...}, disp='off'), or supplying starting_values, usually clears them; windows that still fail can be caught and skipped.

Distributional assumptions: the fits use a Student-t error distribution; if the standardized residuals show persistent skew, try dist='skewt', or compare against a plain Normal (dist='normal') baseline.
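To make the first three gotchas concrete, here is a minimal sketch, not the notebook's actual pipeline: safe_mape and safe_qlike clamp the realized series away from zero before dividing or taking logs, and rolling_egarch_forecast re-fits an EGARCH(1,1,1) on an expanding window so each step receives a true one-step-ahead forecast. The function names, EPS, the warm-up length, and the 1e3 return scaling are illustrative assumptions, not values used elsewhere in this notebook.

import numpy as np
from arch import arch_model

EPS = 1e-8  # tiny floor so zero-volatility buckets cannot produce inf / -log(0)

def safe_mape(actual, pred, eps=EPS):
    # Clamp the denominator before dividing
    actual = np.maximum(np.asarray(actual, dtype=float), eps)
    pred = np.asarray(pred, dtype=float)
    return 100.0 * np.mean(np.abs((actual - pred) / actual))

def safe_qlike(actual_var, pred_var, eps=EPS):
    # QLIKE on variances, with both sides floored away from zero
    a = np.maximum(np.asarray(actual_var, dtype=float), eps)
    p = np.maximum(np.asarray(pred_var, dtype=float), eps)
    ratio = a / p
    return np.mean(ratio - np.log(ratio) - 1.0)

def rolling_egarch_forecast(returns, warmup=200, scale=1e3):
    # Expanding-window re-fit: each step t gets a 1-step-ahead volatility forecast
    # built only from observations before t (no look-ahead).
    r = np.asarray(returns, dtype=float) * scale  # rescale so the optimizer behaves
    preds = np.full(r.shape, np.nan)
    for t in range(warmup, len(r)):
        am = arch_model(r[:t], vol="EGARCH", p=1, o=1, q=1, dist="t")
        res = am.fit(disp="off", show_warning=False, options={"maxiter": 500})
        fc = res.forecast(horizon=1)
        preds[t] = np.sqrt(fc.variance.values[-1, 0]) / scale  # back to original units
    return preds

The same train-only discipline covers the standardization gotcha: any sigma used to rescale forecasts should be computed from the training slice alone (for example, train_slice['log_return'].std()) before the test rows are touched.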
# NOTE: this cell ran before 'base' was defined, hence the NameError below;
# the path is set in the following cell.
df = pd.read_csv(base + "test1_30s_forecasted.csv")
print("Predicted vol range:", df['predicted_volatility'].min(), "–", df['predicted_volatility'].max())
print("New MSE:", mean_squared_error(df['realized_volatility'], df['predicted_volatility']))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 1
----> 1 df = pd.read_csv(base + "test1_30s_forecasted.csv")
      2 print("Predicted vol range:", df['predicted_volatility'].min(), "–", df['predicted_volatility'].max())
      3 print("New MSE:", mean_squared_error(df['realized_volatility'], df['predicted_volatility']))

NameError: name 'base' is not defined
base = r"C:/Users/ryatu/Documents/New folder/Data3888_Optiver_9/PRI/"
df = pd.read_csv(base + "test1_30s_forecasted.csv")
# Overall MSE
mse_overall = mean_squared_error(
df['realized_volatility'],
df['predicted_volatility']
)
print(f"Overall MSE: {mse_overall:.8f} ({mse_overall:.3e})\n")
# MSE by cluster
for cid, sub in df.groupby('cluster'):
mse_c = mean_squared_error(
sub['realized_volatility'],
sub['predicted_volatility']
)
print(f"Cluster {cid}: MSE = {mse_c:.8f} ({mse_c:.3e})")
base = "C:/Users/ryatu/Documents/New folder/Data3888_Optiver_9/PRI/"
# 1) Load the forecasted CSV
df = pd.read_csv(base + "test1_30s_forecasted.csv")
# 2) Quick peek at the columns and head
print(df.columns)
print(df.head())
# 3) If you have realized_volatility, compute MSE again
if 'realized_volatility' in df:
mse = mean_squared_error(df['realized_volatility'], df['predicted_volatility'])
print(f"Recomputed Overall MSE: {mse:.6f}")
# Replace with the actual file paths to your parquet files
df_30s = pd.read_parquet('train1_30s_clustered.parquet')
df_30s_train = pd.read_parquet('centroidData.parquet')
df_30s_test = pd.read_parquet('test1_30s.parquet')
# Sanity check
df_30s['group'] = df_30s['stock_id'].astype(str) + '_' + df_30s['week'].astype(str)
print("df_30s:", df_30s.shape)
df_30s.head()
df_30s_test['group'] = df_30s_test['stock_id'].astype(str) + '_' + df_30s_test['week'].astype(str)
print("df_30s_test:", df_30s_test.shape)
df_30s_test.head()
# 1. Pull out all the groups in df_30s that belong to cluster 0
cluster0_groups = df_30s.loc[df_30s['cluster'] == 0, 'group'].unique()

# 2. Filter the test frame down to those groups
df_test_cluster0 = df_30s_test[df_30s_test['group'].isin(cluster0_groups)].copy()

# 3. Sort by group, then by the time index, to retain temporal order
df_test_cluster0.sort_values(['group', 'bucket_start'], inplace=True)
df_test_cluster0.reset_index(drop=True, inplace=True)

# df_test_cluster0 is now the time-ordered test set for cluster 0 only
print(df_test_cluster0.shape)
df_test_cluster0.head()
# Matching training slice for cluster 0, ordered by day then bucket
df_train_cluster0 = df_30s_train[df_30s_train['cluster'] == 0].copy()
df_train_cluster0.sort_values(['day_of_week','bucket_start'], inplace=True)
df_train_cluster0